In [2]:
library(data.table)

In [3]:
library(ggplot2)

In [4]:
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:data.table’:

    between, first, last

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


In [5]:
options(scipen=999)

Sampling registered user revisions to plot out time difference between edits


In [5]:
sample_registered_user_revision_session_data <- data.table(read.table("../../results/wikidata_page_revisions_with_timestamp_edit_types_and_usage/100000_sample_registered_user_revision_session_data_with_header.tsv", header=TRUE, sep="\t"))

In [6]:
sample_registered_user_revision_session_data$updated_timestamp <- as.POSIXct(as.character(sample_registered_user_revision_session_data$timestamp), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [7]:
sample_registered_user_revision_session_data$updated_previous_timestamp <- as.POSIXct(as.character(sample_registered_user_revision_session_data$prev_timestamp), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [8]:
sample_registered_user_revision_session_data$time_difference <- as.numeric(sample_registered_user_revision_session_data$updated_timestamp - sample_registered_user_revision_session_data$updated_previous_timestamp)

In [9]:
sample_registered_user_revision_session_data$log_time_difference <- log10(sample_registered_user_revision_session_data$time_difference + 1)

In [10]:
attach(sample_registered_user_revision_session_data)

In [11]:
ggplot(sample_registered_user_revision_session_data[prev_timestamp != 'NULL',],aes(x=log_time_difference)) + geom_histogram(bins=100)



In [12]:
detach(sample_registered_user_revision_session_data)

In [ ]:


In [ ]:


In [ ]:


In [ ]:
# other types

In [66]:
sample_human_revision_session_data <- data.table(read.table("~/Desktop/human_events.tsv", header=TRUE, sep="\t"))

In [67]:
sample_human_revision_session_data$updated_timestamp <- as.POSIXct(as.character(sample_human_revision_session_data$timestamp), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [68]:
sample_human_revision_session_data$updated_previous_timestamp <- as.POSIXct(as.character(sample_human_revision_session_data$prev_timestamp), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [69]:
sample_human_revision_session_data$time_difference <- as.numeric(sample_human_revision_session_data$updated_timestamp - sample_human_revision_session_data$updated_previous_timestamp)

In [70]:
sample_human_revision_session_data$log_time_difference <- log10(sample_human_revision_session_data$time_difference + 1)


Warning message in eval(expr, envir, enclos):
“NaNs produced”

In [71]:
attach(sample_human_revision_session_data)


The following objects are masked from sample_human_revision_session_data (pos = 3):

    edit_type, event_index, log_time_difference, prev_timestamp,
    revision_id, session_end, session_events, session_index,
    session_start, time_difference, timestamp,
    updated_previous_timestamp, updated_timestamp, user

The following objects are masked from sample_bot_revision_session_data:

    edit_type, event_index, log_time_difference, prev_timestamp,
    revision_id, session_end, session_events, session_index,
    session_start, time_difference, timestamp,
    updated_previous_timestamp, updated_timestamp, user

The following objects are masked from sample_anon_revision_session_data:

    event_index, log_time_difference, prev_timestamp, revision_id,
    session_end, session_events, session_index, session_start,
    time_difference, timestamp, updated_previous_timestamp,
    updated_timestamp, user


In [91]:
sample_human_revision_session_data_standard_deviation = summarize(group_by(sample_human_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & time_difference >= 0,], user, session_start), standard_deviation = sd(time_difference))

In [73]:
ggplot(sample_human_revision_session_data_standard_deviation,aes(x=standard_deviation)) + geom_histogram(bins=500)



In [74]:
sample_human_revision_session_data$group = 'human'

In [75]:
detach(sample_human_revision_session_data)

In [39]:
sample_bot_revision_session_data <- data.table(read.table("~/Desktop/bot_events.tsv", header=TRUE, sep="\t"))

In [64]:
summary(sample_bot_revision_session_data)


    edit_type            user         timestamp             
 bot_edit:2386536   Min.   : 7801   Min.   :20130402190200  
                    1st Qu.:39601   1st Qu.:20130501054600  
                    Median :39601   Median :20130511195600  
                    Mean   :39600   Mean   :20130540971100  
                    3rd Qu.:39601   3rd Qu.:20130519054300  
                    Max.   :39601   Max.   :20160410205600  
                                                            
  revision_id               prev_timestamp    session_start           
 Min.   : 18777619   NULL          :     72   Min.   :20130402190200  
 1st Qu.: 33162470   20130408231733:      3   1st Qu.:20130430202900  
 Median : 40080592   20130408231757:      3   Median :20130506174100  
 Mean   : 41181630   20130408232129:      3   Mean   :20130530205600  
 3rd Qu.: 43991769   20130408232202:      3   3rd Qu.:20130506174100  
 Max.   :320689112   20130408232822:      3   Max.   :20160410201700  
                     (Other)       :2386449                           
  session_end             session_index   session_events     event_index     
 Min.   :20130402193900   Min.   : 0.00   Min.   :      1   Min.   :      0  
 1st Qu.:20130503203500   1st Qu.:15.00   1st Qu.: 188794   1st Qu.:  43358  
 Median :20130520223500   Median :16.00   Median : 268659   Median : 153911  
 Mean   :20130545343800   Mean   :18.21   Mean   : 613274   Mean   : 306637  
 3rd Qu.:20130520223500   3rd Qu.:16.00   3rd Qu.:1127804   3rd Qu.: 531169  
 Max.   :20160410205600   Max.   :68.00   Max.   :1127804   Max.   :1127803  
                                                                             
 updated_timestamp             updated_previous_timestamp    time_difference   
 Min.   :2013-04-02 19:02:18   Min.   :2013-04-02 19:02:18   Min.   :   0.000  
 1st Qu.:2013-05-01 05:46:02   1st Qu.:2013-05-01 05:45:59   1st Qu.:   1.000  
 Median :2013-05-11 19:56:40   Median :2013-05-11 19:56:20   Median :   1.000  
 Mean   :2013-05-22 10:25:11   Mean   :2013-05-22 10:20:43   Mean   :   2.112  
 3rd Qu.:2013-05-19 05:42:55   3rd Qu.:2013-05-19 05:42:18   3rd Qu.:   2.000  
 Max.   :2016-04-10 20:56:50   Max.   :2016-04-10 20:56:20   Max.   :3590.000  
                               NA's   :72                    NA's   :72        
 log_time_difference
 Min.   :0.0000     
 1st Qu.:0.3010     
 Median :0.3010     
 Mean   :0.3684     
 3rd Qu.:0.4771     
 Max.   :3.5552     
 NA's   :72         

In [65]:
sample_bot_revision_session_data$updated_timestamp <- as.POSIXct(as.character(sample_bot_revision_session_data$timestamp), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [41]:
sample_bot_revision_session_data$updated_previous_timestamp <- as.POSIXct(as.character(sample_bot_revision_session_data$prev_timestamp), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [42]:
sample_bot_revision_session_data$time_difference <- as.numeric(sample_bot_revision_session_data$updated_timestamp - sample_bot_revision_session_data$updated_previous_timestamp)

In [43]:
sample_bot_revision_session_data$log_time_difference <- log10(sample_bot_revision_session_data$time_difference + 1)

In [44]:
attach(sample_bot_revision_session_data)


The following objects are masked from sample_anon_revision_session_data:

    event_index, log_time_difference, prev_timestamp, revision_id,
    session_end, session_events, session_index, session_start,
    time_difference, timestamp, updated_previous_timestamp,
    updated_timestamp, user


In [48]:
sample_bot_revision_session_data_standard_deviation = summarize(group_by(sample_bot_revision_session_data[prev_timestamp != 'NULL' & time_difference >= 0,], user, session_start), standard_deviation = sd(log_time_difference))

In [63]:


In [49]:
ggplot(sample_bot_revision_session_data_standard_deviation,aes(x=standard_deviation)) + geom_histogram(bins=500)


Warning message:
“Removed 2 rows containing non-finite values (stat_bin).”

In [ ]:
detach(sample_bot_revision_session_data)

Sampling anon revisions to plot out time difference between edits


In [76]:
sample_anon_revision_session_data <- data.table(read.table("~/Desktop/revision_session_data.tsv", header=TRUE, sep="\t"))

In [77]:
sample_anon_revision_session_data$updated_timestamp <- as.POSIXct(as.character(sample_anon_revision_session_data$timestamp), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [78]:
sample_anon_revision_session_data$updated_previous_timestamp <- as.POSIXct(as.character(sample_anon_revision_session_data$prev_timestamp), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [79]:
sample_anon_revision_session_data$time_difference <- as.numeric(sample_anon_revision_session_data$updated_timestamp - sample_anon_revision_session_data$updated_previous_timestamp)

In [80]:
sample_anon_revision_session_data$log_time_difference <- log10(sample_anon_revision_session_data$time_difference + 1)


Warning message in eval(expr, envir, enclos):
“NaNs produced”

In [81]:
attach(sample_anon_revision_session_data)


The following objects are masked from sample_human_revision_session_data:

    event_index, log_time_difference, prev_timestamp, revision_id,
    session_end, session_events, session_index, session_start,
    time_difference, timestamp, updated_previous_timestamp,
    updated_timestamp, user

The following objects are masked from sample_bot_revision_session_data:

    event_index, log_time_difference, prev_timestamp, revision_id,
    session_end, session_events, session_index, session_start,
    time_difference, timestamp, updated_previous_timestamp,
    updated_timestamp, user

The following objects are masked from sample_anon_revision_session_data (pos = 5):

    event_index, log_time_difference, prev_timestamp, revision_id,
    session_end, session_events, session_index, session_start,
    time_difference, timestamp, updated_previous_timestamp,
    updated_timestamp, user


In [90]:
sample_anon_revision_session_data_standard_deviation = summarize(group_by(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & time_difference >= 0,], user, session_start), standard_deviation = sd(time_difference))

In [93]:
sample_human_revision_session_data_standard_deviation$group = 'human'
sample_anon_revision_session_data_standard_deviation$group = 'anon'

In [94]:
sample_human_revision_session_data_standard_deviation$user = as.character(sample_human_revision_session_data_standard_deviation$user)

In [100]:
mean(sample_human_revision_session_data$time_difference <=5, na.rm=TRUE)


0.146837883884589

In [95]:
ggplot(rbind(sample_human_revision_session_data_standard_deviation,sample_anon_revision_session_data_standard_deviation),aes(x=standard_deviation)) + facet_wrap(~group) + geom_histogram(bins=500)


Warning message in bind_rows_(x, .id):
“binding character and factor vector, coercing into character vector”

In [14]:
sample_anon_revision_session_data_standard_deviation[sample_anon_revision_session_data_standard_deviation$user == '150.254.210.213',]


usersession_startstandard_deviation
150.254.210.21320130701235102 0.02887786
150.254.210.21320130717091320 0.36545633
150.254.210.21320130829091540 0.13779059
150.254.210.21320130829202421 0.11885195
150.254.210.21320130830070801 0.08932838
150.254.210.21320130831005116 0.01318631

In [15]:
head(sample_anon_revision_session_data_standard_deviation[order(sample_anon_revision_session_data_standard_deviation$standard_deviation),])


usersession_startstandard_deviation
76.8.204.187 20130309173737 0.00000000
91.198.174.211 20121202050457 0.00000000
150.254.210.21320130831005116 0.01318631
112.203.160.14720130623071509 0.01739032
187.189.169.19020161027145445 0.02089213
112.203.160.14720130623054004 0.02694538

In [16]:
sample_anon_revision_session_mean = summarize(group_by(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & time_difference >= 0,], user), mean_log_time_difference = mean(log_time_difference))

In [17]:
ggplot(sample_anon_revision_session_mean,aes(x=mean_log_time_difference)) + geom_histogram(bins=500)



In [18]:
nrow(sample_anon_revision_session_data[timestamp == prev_timestamp,])


20756

In [19]:
nrow(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & is.na(log_time_difference),])


1

In [105]:
head(sample_anon_revision_session_data)


usertimestamprevision_idprev_timestampsession_startsession_endsession_indexsession_eventsevent_indexupdated_timestampupdated_previous_timestamptime_differencelog_time_difference
93.220.76.159 20121029172824 25 NULL 20121029172824 20121029181646 0 2 0 2012-10-29 17:28:24NA NA NA
93.220.76.159 20121029181646 102 20121029172824 20121029172824 20121029181646 0 2 1 2012-10-29 18:16:462012-10-29 17:28:242902 3.462847
193.40.10.178 20121029174206 54 NULL 20121029174206 20121029180514 0 4 0 2012-10-29 17:42:06NA NA NA
193.40.10.178 20121029174358 56 20121029174206 20121029174206 20121029180514 0 4 1 2012-10-29 17:43:582012-10-29 17:42:06 112 2.053078
193.40.10.178 20121029174425 58 20121029174358 20121029174206 20121029180514 0 4 2 2012-10-29 17:44:252012-10-29 17:43:58 27 1.447158
193.40.10.178 20121029180514 71 20121029174425 20121029174206 20121029180514 0 4 3 2012-10-29 18:05:142012-10-29 17:44:251249 3.096910

In [114]:
ggplot(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10,],aes(x=log_time_difference)) +
geom_histogram(bins=100) + 
scale_x_continuous(breaks=log10(c(10,60) + 1))


Warning message:
“Removed 1 rows containing non-finite values (stat_bin).”

In [21]:
# just between 1 and 2
ggplot(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference > 3,],aes(x=log_time_difference)) + geom_histogram(bins=100)



In [22]:
head(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference > 0 &  log_time_difference == 0,])


usertimestamprevision_idprev_timestampsession_startsession_endsession_indexsession_eventsevent_indexupdated_timestampupdated_previous_timestamptime_differencelog_time_difference

In [23]:
head(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference == .3010300,])


usertimestamprevision_idprev_timestampsession_startsession_endsession_indexsession_eventsevent_indexupdated_timestampupdated_previous_timestamptime_differencelog_time_difference

In [24]:
head(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference == .4771213,])


usertimestamprevision_idprev_timestampsession_startsession_endsession_indexsession_eventsevent_indexupdated_timestampupdated_previous_timestamptime_differencelog_time_difference

In [30]:
head(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference > 0 &  log_time_difference <= .5,])


usertimestamprevision_idprev_timestampsession_startsession_endsession_indexsession_eventsevent_indexupdated_timestampupdated_previous_timestamptime_differencelog_time_difference
127.0.0.1 20121029181849 116 20121029181848 20121029181848 20121029181849 0 18 12 2012-10-29 18:18:492012-10-29 18:18:481 0.3010300
151.41.160.140 20121030221544 12935 20121030221542 20121030221531 20121030221607 0 18 3 2012-10-30 22:15:442012-10-30 22:15:422 0.4771213
151.41.160.140 20121030221551 12941 20121030221549 20121030221531 20121030221607 0 18 5 2012-10-30 22:15:512012-10-30 22:15:492 0.4771213
151.41.160.140 20121030221552 12942 20121030221551 20121030221531 20121030221607 0 18 6 2012-10-30 22:15:522012-10-30 22:15:511 0.3010300
151.41.160.140 20121030221554 12943 20121030221552 20121030221531 20121030221607 0 18 7 2012-10-30 22:15:542012-10-30 22:15:522 0.4771213
151.41.160.140 20121030221555 12944 20121030221554 20121030221531 20121030221607 0 18 8 2012-10-30 22:15:552012-10-30 22:15:541 0.3010300

In [38]:
head(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference > 1 &  log_time_difference <= 1.05,])


updated_previous_timestampusersession_eventsevent_indexsession_endrevision_idsession_indexprev_timestampsession_startupdated_timestamptime_differencelog_time_differencetimestamp
2012-10-30 14:45:3784.98.132.83 17 15 20121030144649 4001 0 20121030144537 20121030143235 2012-10-30 14:45:4710 1.041393 20121030144547
2012-11-01 12:08:27217.255.27.137 21 20 20121101120837 274381 0 20121101120827 20121101115902 2012-11-01 12:08:3710 1.041393 20121101120837
2012-11-04 08:40:5293.146.216.252 20 17 20121104084211 318625 0 20121104084052 20121104083404 2012-11-04 08:41:0210 1.041393 20121104084102
2012-11-04 14:09:3581.249.226.3 144 32 20121104143551 321967 0 20121104140935 20121104140117 2012-11-04 14:09:4510 1.041393 20121104140945
2012-11-04 14:15:1281.249.226.3 144 50 20121104143551 322057 0 20121104141512 20121104140117 2012-11-04 14:15:2210 1.041393 20121104141522
2012-11-04 14:20:0481.249.226.3 144 70 20121104143551 322122 0 20121104142004 20121104140117 2012-11-04 14:20:1410 1.041393 20121104142014
2012-11-04 14:24:5181.249.226.3 144 91 20121104143551 322188 0 20121104142451 20121104140117 2012-11-04 14:25:0110 1.041393 20121104142501
2012-11-04 14:25:2481.249.226.3 144 95 20121104143551 322199 0 20121104142524 20121104140117 2012-11-04 14:25:3410 1.041393 20121104142534
2012-11-04 14:27:0281.249.226.3 144 104 20121104143551 322230 0 20121104142702 20121104140117 2012-11-04 14:27:1210 1.041393 20121104142712
2012-11-04 14:28:2081.249.226.3 144 110 20121104143551 322241 0 20121104142820 20121104140117 2012-11-04 14:28:3010 1.041393 20121104142830
2012-11-04 14:28:3081.249.226.3 144 111 20121104143551 322243 0 20121104142830 20121104140117 2012-11-04 14:28:4010 1.041393 20121104142840
2012-11-04 14:28:5681.249.226.3 144 113 20121104143551 322247 0 20121104142856 20121104140117 2012-11-04 14:29:0610 1.041393 20121104142906
2012-11-04 14:29:5081.249.226.3 144 115 20121104143551 322255 0 20121104142950 20121104140117 2012-11-04 14:30:0010 1.041393 20121104143000
2012-11-04 14:30:3381.249.226.3 144 120 20121104143551 322270 0 20121104143033 20121104140117 2012-11-04 14:30:4310 1.041393 20121104143043
2012-11-04 14:32:5181.249.226.3 144 127 20121104143551 322291 0 20121104143251 20121104140117 2012-11-04 14:33:0110 1.041393 20121104143301
2012-11-06 13:15:09130.88.141.34 16 2 20121106132132 342097 0 20121106131509 20121106131500 2012-11-06 13:15:1910 1.041393 20121106131519
2012-11-06 13:20:03130.88.141.34 16 12 20121106132132 342154 0 20121106132003 20121106131500 2012-11-06 13:20:1310 1.041393 20121106132013
2012-11-08 15:15:3283.206.120.18 25 5 20121108155241 363527 4 20121108151532 20121108150105 2012-11-08 15:15:4210 1.041393 20121108151542
2012-11-08 15:43:2783.206.120.18 25 16 20121108155241 363841 4 20121108154327 20121108150105 2012-11-08 15:43:3710 1.041393 20121108154337
2012-11-09 16:36:07130.88.141.34 59 30 20121109172253 375701 3 20121109163607 20121109162207 2012-11-09 16:36:1710 1.041393 20121109163617
2012-11-09 21:53:3246.255.169.253 17 13 20121109215501 380145 0 20121109215332 20121109214755 2012-11-09 21:53:4210 1.041393 20121109215342
2012-11-11 16:02:0377.186.178.41 10 4 20121111160743 403321 0 20121111160203 20121111160051 2012-11-11 16:02:1310 1.041393 20121111160213
2012-11-12 11:37:20130.88.141.34 549 7 20121112154043 418663 4 20121112113720 20121112113525 2012-11-12 11:37:3010 1.041393 20121112113730
2012-11-12 11:46:10130.88.141.34 549 22 20121112154043 418747 4 20121112114610 20121112113525 2012-11-12 11:46:2010 1.041393 20121112114620
2012-11-12 11:48:31130.88.141.34 549 32 20121112154043 418763 4 20121112114831 20121112113525 2012-11-12 11:48:4110 1.041393 20121112114841
2012-11-12 11:48:45130.88.141.34 549 34 20121112154043 418765 4 20121112114845 20121112113525 2012-11-12 11:48:5510 1.041393 20121112114855
2012-11-12 13:08:03130.88.141.34 549 172 20121112154043 419471 4 20121112130803 20121112113525 2012-11-12 13:08:1310 1.041393 20121112130813
2012-11-12 13:29:04130.88.141.34 549 229 20121112154043 419782 4 20121112132904 20121112113525 2012-11-12 13:29:1410 1.041393 20121112132914
2012-11-12 13:33:16130.88.141.34 549 236 20121112154043 419836 4 20121112133316 20121112113525 2012-11-12 13:33:2610 1.041393 20121112133326
2012-11-12 13:38:20130.88.141.34 549 267 20121112154043 419952 4 20121112133820 20121112113525 2012-11-12 13:38:3010 1.041393 20121112133830
2017-04-26 16:15:5378.55.163.26 54 31 20170426164732 479133977 1 20170426161553 20170426155859 2017-04-26 16:16:0310 1.041393 20170426161603
2017-04-26 16:19:5978.55.163.26 54 43 20170426164732 479134615 1 20170426161959 20170426155859 2017-04-26 16:20:0910 1.041393 20170426162009
2017-04-26 19:06:1585.180.37.197 30 11 20170426193216 479162827 0 20170426190615 20170426185408 2017-04-26 19:06:2510 1.041393 20170426190625
2017-04-26 19:29:0689.166.46.217 12 7 20170426200245 479168127 280 20170426192906 20170426190304 2017-04-26 19:29:1610 1.041393 20170426192916
2017-04-27 20:35:1568.10.122.92 11 3 20170427204101 479464387 1 20170427203515 20170427203428 2017-04-27 20:35:2510 1.041393 20170427203525
2017-04-27 21:24:23173.2.63.115 71 50 20170427215824 479476484 3 20170427212423 20170427191232 2017-04-27 21:24:3310 1.041393 20170427212433
2017-04-27 22:46:3294.223.133.167 11 4 20170427230551 479494086 2 20170427224632 20170427223835 2017-04-27 22:46:4210 1.041393 20170427224642
2017-04-28 02:49:48219.79.126.186 26 1 20170428025650 479507445 0 20170428024948 20170428024948 2017-04-28 02:49:5810 1.041393 20170428024958
2017-04-28 02:55:11219.79.126.186 26 19 20170428025650 479507596 0 20170428025511 20170428024948 2017-04-28 02:55:2110 1.041393 20170428025521
2017-04-28 08:02:3795.161.239.106 16 12 20170428082453 479525963 144 20170428080237 20170428062712 2017-04-28 08:02:4710 1.041393 20170428080247
2017-04-28 15:34:21212.108.4.149 35 3 20170428162603 479616438 14 20170428153421 20170428153223 2017-04-28 15:34:3110 1.041393 20170428153431
2017-04-28 16:50:0924.105.170.133 17 8 20170428171502 479632474 18 20170428165009 20170428164847 2017-04-28 16:50:1910 1.041393 20170428165019
2017-04-29 03:28:02190.230.236.183 29 3 20170429033907 479736485 0 20170429032802 20170429032728 2017-04-29 03:28:1210 1.041393 20170429032812
2017-04-29 13:54:1561.227.52.165 161 78 20170429153200 479868944 5 20170429135415 20170429131506 2017-04-29 13:54:2510 1.041393 20170429135425
2017-04-29 13:58:4061.227.52.165 161 90 20170429153200 479869890 5 20170429135840 20170429131506 2017-04-29 13:58:5010 1.041393 20170429135850
2017-04-30 02:48:23173.2.63.115 27 11 20170430025715 479984567 7 20170430024823 20170430024508 2017-04-30 02:48:3310 1.041393 20170430024833
2017-04-30 14:41:0193.185.19.138 10 5 20170430153604 480220135 1 20170430144101 20170430133225 2017-04-30 14:41:1110 1.041393 20170430144111
2017-04-30 17:10:46106.1.112.8 15 7 20170430171238 480283626 0 20170430171046 20170430170455 2017-04-30 17:10:5610 1.041393 20170430171056
2017-04-30 17:12:05106.1.112.8 15 12 20170430171238 480284135 0 20170430171205 20170430170455 2017-04-30 17:12:1510 1.041393 20170430171215
2017-04-30 17:43:49173.2.63.115 24 7 20170430181527 480295242 9 20170430174349 20170430174159 2017-04-30 17:43:5910 1.041393 20170430174359
2017-04-30 17:44:12173.2.63.115 24 9 20170430181527 480295384 9 20170430174412 20170430174159 2017-04-30 17:44:2210 1.041393 20170430174422
2017-04-30 17:48:58173.2.63.115 24 16 20170430181527 480296857 9 20170430174858 20170430174159 2017-04-30 17:49:0810 1.041393 20170430174908
2017-04-30 21:07:44200.89.248.75 38 35 20170430210909 480352242 26 20170430210744 20170430202726 2017-04-30 21:07:5410 1.041393 20170430210754
2017-05-01 14:01:2161.227.12.81 73 62 20170501140420 480496843 0 20170501140121 20170501133031 2017-05-01 14:01:3110 1.041393 20170501140131
2017-05-01 13:25:5093.185.19.5 13 4 20170501142630 480488897 1 20170501132550 20170501131335 2017-05-01 13:26:0010 1.041393 20170501132600
2017-05-01 22:29:03178.5.166.108 82 13 20170501231614 480587503 1 20170501222903 20170501222353 2017-05-01 22:29:1310 1.041393 20170501222913
2017-05-01 22:29:24178.5.166.108 82 15 20170501231614 480587566 1 20170501222924 20170501222353 2017-05-01 22:29:3410 1.041393 20170501222934
2017-05-01 22:31:34178.5.166.108 82 20 20170501231614 480588027 1 20170501223134 20170501222353 2017-05-01 22:31:4410 1.041393 20170501223144
2017-05-02 00:58:22178.12.150.90 10 1 20170502010814 480620174 0 20170502005822 20170502005822 2017-05-02 00:58:3210 1.041393 20170502005832
2017-05-02 01:04:22178.12.150.90 10 6 20170502010814 480622315 0 20170502010422 20170502005822 2017-05-02 01:04:3210 1.041393 20170502010432

In [37]:
sample(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference > 1.31 &  log_time_difference <= 1.325,],10)


event_indexupdated_previous_timestamprevision_idtimestampprev_timestamptime_differencesession_startlog_time_differencesession_eventsupdated_timestamp
19 2012-10-30 18:55:14 8429 20121030185534 20121030185514 20 20121030183050 1.322219 30 2012-10-30 18:55:34
15 2012-10-31 12:55:04135422 20121031125524 20121031125504 20 20121031124637 1.322219 22 2012-10-31 12:55:24
20 2012-10-31 22:04:25268471 20121031220445 20121031220425 20 20121031213746 1.322219 34 2012-10-31 22:04:45
3 2012-11-01 11:59:53274223 20121101120013 20121101115953 20 20121101115902 1.322219 21 2012-11-01 12:00:13
19 2012-11-01 12:08:07274379 20121101120827 20121101120807 20 20121101115902 1.322219 21 2012-11-01 12:08:27
9 2012-11-02 15:23:35294319 20121102152355 20121102152335 20 20121102151029 1.322219 11 2012-11-02 15:23:55
2 2012-11-03 05:39:15305913 20121103053935 20121103053915 20 20121103053856 1.322219 12 2012-11-03 05:39:35
10 2012-11-04 14:03:10321874 20121104140330 20121104140310 20 20121104140117 1.322219 144 2012-11-04 14:03:30
17 2012-11-04 14:05:33321897 20121104140553 20121104140533 20 20121104140117 1.322219 144 2012-11-04 14:05:53
63 2012-11-04 14:18:25322102 20121104141845 20121104141825 20 20121104140117 1.322219 144 2012-11-04 14:18:45
26 2012-11-06 15:15:01343048 20121106151521 20121106151501 20 20121106145516 1.322219 52 2012-11-06 15:15:21
39 2012-11-09 16:40:26375806 20121109164046 20121109164026 20 20121109162207 1.322219 59 2012-11-09 16:40:46
5 2012-11-12 11:36:57418659 20121112113717 20121112113657 20 20121112113525 1.322219 549 2012-11-12 11:37:17
68 2012-11-12 11:56:40418841 20121112115700 20121112115640 20 20121112113525 1.322219 549 2012-11-12 11:57:00
86 2012-11-12 12:00:12418868 20121112120032 20121112120012 20 20121112113525 1.322219 549 2012-11-12 12:00:32
103 2012-11-12 12:04:34418902 20121112120454 20121112120434 20 20121112113525 1.322219 549 2012-11-12 12:04:54
153 2012-11-12 12:12:32418976 20121112121252 20121112121232 20 20121112113525 1.322219 549 2012-11-12 12:12:52
237 2012-11-12 13:33:26419843 20121112133346 20121112133326 20 20121112113525 1.322219 549 2012-11-12 13:33:46
468 2012-11-12 15:07:20421726 20121112150740 20121112150720 20 20121112113525 1.322219 549 2012-11-12 15:07:40
42 2012-11-14 14:11:42452422 20121114141202 20121114141142 20 20121114131427 1.322219 46 2012-11-14 14:12:02
9 2012-11-14 16:12:35453391 20121114161255 20121114161235 20 20121114154018 1.322219 45 2012-11-14 16:12:55
10 2012-11-14 16:12:55453394 20121114161315 20121114161255 20 20121114154018 1.322219 45 2012-11-14 16:13:15
19 2012-11-14 18:35:40454646 20121114183600 20121114183540 20 20121114181019 1.322219 82 2012-11-14 18:36:00
6 2012-11-15 15:04:17460346 20121115150437 20121115150417 20 20121115150300 1.322219 99 2012-11-15 15:04:37
17 2012-11-15 15:41:13460495 20121115154133 20121115154113 20 20121115150300 1.322219 99 2012-11-15 15:41:33
65 2012-11-15 16:43:31460948 20121115164351 20121115164331 20 20121115150300 1.322219 99 2012-11-15 16:43:51
97 2012-11-15 17:11:52461192 20121115171212 20121115171152 20 20121115150300 1.322219 99 2012-11-15 17:12:12
27 2012-11-23 11:52:42547545 20121123115302 20121123115242 20 20121123114050 1.322219 52 2012-11-23 11:53:02
40 2012-11-23 12:51:17548227 20121123125137 20121123125117 20 20121123114050 1.322219 52 2012-11-23 12:51:37
33 2012-11-23 19:42:10552773 20121123194230 20121123194210 20 20121123191834 1.322219 148 2012-11-23 19:42:30
33 2017-04-25 23:19:24479006512 20170425231944 20170425231924 20 20170425224534 1.322219 45 2017-04-25 23:19:44
41 2017-04-25 23:21:28479006705 20170425232148 20170425232128 20 20170425224534 1.322219 45 2017-04-25 23:21:48
16 2017-04-26 01:37:02479015338 20170426013722 20170426013702 20 20170426011033 1.322219 117 2017-04-26 01:37:22
26 2017-04-26 01:47:19479015608 20170426014739 20170426014719 20 20170426011033 1.322219 117 2017-04-26 01:47:39
9 2017-04-26 07:31:44479036911 20170426073204 20170426073144 20 20170426071733 1.322219 31 2017-04-26 07:32:04
22 2017-04-26 12:25:05479092417 20170426122525 20170426122505 20 20170426121813 1.322219 70 2017-04-26 12:25:25
30 2017-04-26 13:50:21479105947 20170426135041 20170426135021 20 20170426131124 1.322219 36 2017-04-26 13:50:41
1 2017-04-26 16:30:26479136267 20170426163046 20170426163026 20 20170426163026 1.322219 27 2017-04-26 16:30:46
2 2017-04-26 19:13:55479164616 20170426191415 20170426191355 20 20170426190304 1.322219 12 2017-04-26 19:14:15
11 2017-04-27 11:24:20479310349 20170427112440 20170427112420 20 20170427111002 1.322219 16 2017-04-27 11:24:40
13 2017-04-27 11:25:34479310614 20170427112554 20170427112534 20 20170427111002 1.322219 16 2017-04-27 11:25:54
4 2017-04-27 13:38:53479341346 20170427133913 20170427133853 20 20170427133739 1.322219 26 2017-04-27 13:39:13
13 2017-04-27 13:44:40479342776 20170427134500 20170427134440 20 20170427133739 1.322219 26 2017-04-27 13:45:00
22 2017-04-27 13:48:38479344033 20170427134858 20170427134838 20 20170427133739 1.322219 26 2017-04-27 13:48:58
24 2017-04-27 13:49:29479344279 20170427134949 20170427134929 20 20170427133739 1.322219 26 2017-04-27 13:49:49
47 2017-04-27 20:57:05479469135 20170427205725 20170427205705 20 20170427191232 1.322219 71 2017-04-27 20:57:25
19 2017-04-27 21:58:47479483179 20170427215907 20170427215847 20 20170427211219 1.322219 26 2017-04-27 21:59:07
5 2017-04-28 07:53:33479525428 20170428075353 20170428075333 20 20170428075025 1.322219 11 2017-04-28 07:53:53
57 2017-04-29 07:16:04479764136 20170429071624 20170429071604 20 20170429065131 1.322219 70 2017-04-29 07:16:24
12 2017-04-30 11:30:18480136727 20170430113038 20170430113018 20 20170430111829 1.322219 35 2017-04-30 11:30:38
53 2017-04-30 14:22:24480212316 20170430142244 20170430142224 20 20170430082425 1.322219 64 2017-04-30 14:22:44
2 2017-04-30 17:42:15480294774 20170430174235 20170430174215 20 20170430174159 1.322219 24 2017-04-30 17:42:35
14 2017-04-30 17:48:27480296746 20170430174847 20170430174827 20 20170430174159 1.322219 24 2017-04-30 17:48:47
18 2017-04-30 20:33:08480344977 20170430203328 20170430203308 20 20170430201316 1.322219 30 2017-04-30 20:33:28
8 2017-04-30 20:36:11480345521 20170430203631 20170430203611 20 20170430202726 1.322219 38 2017-04-30 20:36:31
16 2017-04-30 20:42:11480346577 20170430204231 20170430204211 20 20170430202726 1.322219 38 2017-04-30 20:42:31
15 2017-04-30 22:38:44480369128 20170430223904 20170430223844 20 20170430221551 1.322219 18 2017-04-30 22:39:04
20 2017-05-01 09:12:43480455401 20170501091303 20170501091243 20 20170501062828 1.322219 105 2017-05-01 09:13:03
71 2017-05-01 13:15:35480486583 20170501131555 20170501131535 20 20170501062828 1.322219 105 2017-05-01 13:15:55
97 2017-05-01 14:19:15480499698 20170501141935 20170501141915 20 20170501062828 1.322219 105 2017-05-01 14:19:35

In [28]:
head(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference > 3,])


usertimestamprevision_idprev_timestampsession_startsession_endsession_indexsession_eventsevent_indexupdated_timestampupdated_previous_timestamptime_differencelog_time_difference
91.86.14.28 20121030150809 4231 20121030144223 20121030144223 20121030151457 0 11 1 2012-10-30 15:08:092012-10-30 14:42:231546 3.189490
88.153.45.108 20121030195619 9788 20121030191416 20121030183050 20121030195713 1 30 27 2012-10-30 19:56:192012-10-30 19:14:162523 3.402089
93.220.68.150 20121030220909 12731 20121030214544 20121030213050 20121030221017 1 11 9 2012-10-30 22:09:092012-10-30 21:45:441405 3.147985
93.220.101.47 20121106171027 344138 20121106164653 20121106164620 20121106171246 0 13 4 2012-11-06 17:10:272012-11-06 16:46:531414 3.150756
83.206.120.18 20121107175156 356133 20121107171149 20121107165853 20121107175756 3 16 12 2012-11-07 17:51:562012-11-07 17:11:492407 3.381656
130.88.141.34 20121109171610 376234 20121109165425 20121109162207 20121109172253 3 59 51 2012-11-09 17:16:102012-11-09 16:54:251305 3.115943

In [ ]:


In [28]:
detach(sample_anon_revision_session_data)

Registered user event lengths


In [29]:
registered_user_mwsessions_results <- data.table(read.table("../../results/wikidata_page_revisions_with_timestamp_edit_types_and_usage/registered_user_session_data.tsv", header=TRUE, sep="\t"))

In [30]:
registered_user_mwsessions_results$start_time <- as.POSIXct(as.character(registered_user_mwsessions_results$start), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [31]:
registered_user_mwsessions_results$end_time <- as.POSIXct(as.character(registered_user_mwsessions_results$end), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [32]:
registered_user_mwsessions_results$time_difference <- as.numeric(registered_user_mwsessions_results$end_time - registered_user_mwsessions_results$start_time)

In [33]:
registered_user_mwsessions_results$log_time_difference <- log10(registered_user_mwsessions_results$time_difference + 1)


Warning message in eval(expr, envir, enclos):
“NaNs produced”

In [34]:
nrow(registered_user_mwsessions_results[time_difference < 0,])


44

In [35]:
attach(registered_user_mwsessions_results)

In [36]:
ggplot(registered_user_mwsessions_results[,
list(n=length(time_difference), prop=mean(time_difference >= 60*60*3)),
by=list(date=as.Date(start_time))], aes(x=date, y=prop)) + geom_line()



In [37]:
ggplot(registered_user_mwsessions_results,aes(x=log_time_difference)) + geom_histogram(bins=100)


Warning message:
“Removed 44 rows containing non-finite values (stat_bin).”

In [38]:
detach(registered_user_mwsessions_results)

Anon event lengths


In [39]:
anon_mwsessions_results <- data.table(read.table("~/Desktop/temp", header=TRUE, sep="\t"))

In [40]:
anon_mwsessions_results$start_time <- as.POSIXct(as.character(anon_mwsessions_results$start), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [41]:
anon_mwsessions_results$end_time <- as.POSIXct(as.character(anon_mwsessions_results$end), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [42]:
anon_mwsessions_results$time_difference <- as.numeric(anon_mwsessions_results$end_time - anon_mwsessions_results$start_time)

In [43]:
anon_mwsessions_results$log_time_difference <- log10(anon_mwsessions_results$time_difference + 1)


Warning message in eval(expr, envir, enclos):
“NaNs produced”

In [44]:
nrow(anon_mwsessions_results[time_difference < 0,])


1

In [45]:
attach(anon_mwsessions_results)

In [46]:
ggplot(anon_mwsessions_results[,
list(n=length(time_difference), prop=mean(time_difference >= 60*60*3)),
by=list(date=as.Date(start_time))], aes(x=date, y=prop)) + geom_line()



In [47]:
ggplot(anon_mwsessions_results,aes(x=log_time_difference)) + geom_histogram(bins=100)


Warning message:
“Removed 1 rows containing non-finite values (stat_bin).”

Anon event statistics


In [48]:
max(events)


11017

In [49]:
nrow(anon_mwsessions_results)


633128

In [50]:
sessions_with_more_than_10_events <- subset(anon_mwsessions_results, events==1)

In [51]:
nrow(sessions_with_more_than_10_events)


439896

In [52]:
sessions_with_more_than_10_events <- subset(anon_mwsessions_results, events>10)

In [53]:
nrow(sessions_with_more_than_10_events)


16272

In [54]:
sessions_with_more_than_20_events <- subset(anon_mwsessions_results, events>20)

In [55]:
nrow(sessions_with_more_than_20_events)


7097

In [56]:
sessions_with_more_than_50_events <- subset(anon_mwsessions_results, events>50)

In [57]:
nrow(sessions_with_more_than_50_events)


2072

In [58]:
sessions_with_more_than_100_events <- subset(anon_mwsessions_results, events>100)

In [59]:
nrow(sessions_with_more_than_100_events)


736

In [60]:
sum(anon_mwsessions_results$events)/sum(anon_mwsessions_results$events)


1

In [61]:
sum(anon_mwsessions_results$events)


1572831

In [62]:
sorted_anon_mwsessions_results <- anon_mwsessions_results[order(-events),]

In [63]:
head(sorted_anon_mwsessions_results, n=10)


userstartendindexeventsstart_timeend_timetime_differencelog_time_difference
10.68.17.174 20141123042702 20141126071956 40 11017 2014-11-23 04:27:022014-11-26 07:19:56269574 5.430680
72.29.167.158 20130622114232 20130623073344 0 6971 2013-06-22 11:42:322013-06-23 07:33:44 71472 4.854142
185.54.115.189 20151112211127 20151113081803 0 3272 2015-11-12 21:11:272015-11-13 08:18:03 39996 4.602027
150.254.210.213 20130830070801 20130830235054 4 2591 2013-08-30 07:08:012013-08-30 23:50:54 60173 4.779409
89.122.248.53 20150821131251 20150824213721 4 2560 2015-08-21 13:12:512015-08-24 21:37:21289470 5.461605
78.90.0.177 20150812232601 20150813050103 4 2538 2015-08-12 23:26:012015-08-13 05:01:03 20102 4.303261
87.170.223.74 20160223101534 20160224152634 1 2250 2016-02-23 10:15:342016-02-24 15:26:34105060 5.021442
10.68.17.61 20141110192855 20141111003652 0 2214 2014-11-10 19:28:552014-11-11 00:36:52 18477 4.266655
150.254.210.213 20130701235102 20130702114051 0 2121 2013-07-01 23:51:022013-07-02 11:40:51 42589 4.629308
10.68.17.174 20141122054826 20141122235430 38 1865 2014-11-22 05:48:262014-11-22 23:54:30 65164 4.814014

In [64]:
sorted_anon_mwsessions_results[1,]$start


20141123042702

In [65]:
sorted_anon_mwsessions_results[1,]$end


20141126071956

In [66]:
sorted_anon_mwsessions_results[3,]$start


20151112211127

In [67]:
sorted_anon_mwsessions_results[3,]$end


20151113081803

In [68]:
data.table(subset(anon_mwsessions_results, user=='54.67.94.64'))[order(-events),]


userstartendindexeventsstart_timeend_timetime_differencelog_time_difference
54.67.94.64 20160409102426 20160409111917 64 7 2016-04-09 10:24:262016-04-09 11:19:173291 3.517460
54.67.94.64 20160531211027 20160531215028 103 5 2016-05-31 21:10:272016-05-31 21:50:282401 3.380573
54.67.94.64 20160601044815 20160601054316 104 4 2016-06-01 04:48:152016-06-01 05:43:163301 3.518777
54.67.94.64 20160826215757 20160826220355 154 4 2016-08-26 21:57:572016-08-26 22:03:55 358 2.555094
54.67.94.64 20160323045408 20160323054320 46 3 2016-03-23 04:54:082016-03-23 05:43:202952 3.470263
54.67.94.64 20160412043249 20160412044031 70 3 2016-04-12 04:32:492016-04-12 04:40:31 462 2.665581
54.67.94.64 20161213083038 20161213091425 220 3 2016-12-13 08:30:382016-12-13 09:14:252627 3.419625
54.67.94.64 20170122030937 20170122031758 235 3 2017-01-22 03:09:372017-01-22 03:17:58 501 2.700704
54.67.94.64 20160218074057 20160218083827 0 2 2016-02-18 07:40:572016-02-18 08:38:273450 3.537945
54.67.94.64 20160219054945 20160219062740 2 2 2016-02-19 05:49:452016-02-19 06:27:402275 3.357172
54.67.94.64 20160225135934 20160225143136 12 2 2016-02-25 13:59:342016-02-25 14:31:361922 3.283979
54.67.94.64 20160309045642 20160309053939 27 2 2016-03-09 04:56:422016-03-09 05:39:392577 3.411283
54.67.94.64 20160316081721 20160316082931 39 2 2016-03-16 08:17:212016-03-16 08:29:31 730 2.863917
54.67.94.64 20160316172132 20160316175524 40 2 2016-03-16 17:21:322016-03-16 17:55:242032 3.308137
54.67.94.64 20160417124954 20160417131436 73 2 2016-04-17 12:49:542016-04-17 13:14:361482 3.171141
54.67.94.64 20160527044822 20160527052933 100 2 2016-05-27 04:48:222016-05-27 05:29:332471 3.393048
54.67.94.64 20160616124818 20160616130006 113 2 2016-06-16 12:48:182016-06-16 13:00:06 708 2.850646
54.67.94.64 20160716071001 20160716074147 132 2 2016-07-16 07:10:012016-07-16 07:41:471906 3.280351
54.67.94.64 20160822232900 20160822232945 152 2 2016-08-22 23:29:002016-08-22 23:29:45 45 1.662758
54.67.94.64 20160831092210 20160831093609 159 2 2016-08-31 09:22:102016-08-31 09:36:09 839 2.924279
54.67.94.64 20160906185954 20160906190210 167 2 2016-09-06 18:59:542016-09-06 19:02:10 136 2.136721
54.67.94.64 20160919005112 20160919005701 173 2 2016-09-19 00:51:122016-09-19 00:57:01 349 2.544068
54.67.94.64 20161115164050 20161115170926 202 2 2016-11-15 16:40:502016-11-15 17:09:261716 3.234770
54.67.94.64 20170116120122 20170116120904 232 2 2017-01-16 12:01:222017-01-16 12:09:04 462 2.665581
54.67.94.64 20160218232400 20160218232400 1 1 2016-02-18 23:24:002016-02-18 23:24:00 0 0.000000
54.67.94.64 20160222031302 20160222031302 3 1 2016-02-22 03:13:022016-02-22 03:13:02 0 0.000000
54.67.94.64 20160222120053 20160222120053 4 1 2016-02-22 12:00:532016-02-22 12:00:53 0 0.000000
54.67.94.64 20160222184505 20160222184505 5 1 2016-02-22 18:45:052016-02-22 18:45:05 0 0.000000
54.67.94.64 20160223002833 20160223002833 6 1 2016-02-23 00:28:332016-02-23 00:28:33 0 0.000000
54.67.94.64 20160223123032 20160223123032 7 1 2016-02-23 12:30:322016-02-23 12:30:32 0 0.000000
54.67.94.64 20161118012509 20161118012509 207 1 2016-11-18 01:25:092016-11-18 01:25:090 0
54.67.94.64 20161118095046 20161118095046 208 1 2016-11-18 09:50:462016-11-18 09:50:460 0
54.67.94.64 20161122023640 20161122023640 209 1 2016-11-22 02:36:402016-11-22 02:36:400 0
54.67.94.64 20161125063409 20161125063409 210 1 2016-11-25 06:34:092016-11-25 06:34:090 0
54.67.94.64 20161126184445 20161126184445 211 1 2016-11-26 18:44:452016-11-26 18:44:450 0
54.67.94.64 20161128183434 20161128183434 212 1 2016-11-28 18:34:342016-11-28 18:34:340 0
54.67.94.64 20161129164513 20161129164513 213 1 2016-11-29 16:45:132016-11-29 16:45:130 0
54.67.94.64 20161201214107 20161201214107 214 1 2016-12-01 21:41:072016-12-01 21:41:070 0
54.67.94.64 20161202065227 20161202065227 215 1 2016-12-02 06:52:272016-12-02 06:52:270 0
54.67.94.64 20161205160823 20161205160823 216 1 2016-12-05 16:08:232016-12-05 16:08:230 0
54.67.94.64 20161206071045 20161206071045 217 1 2016-12-06 07:10:452016-12-06 07:10:450 0
54.67.94.64 20161209062248 20161209062248 218 1 2016-12-09 06:22:482016-12-09 06:22:480 0
54.67.94.64 20161212055437 20161212055437 219 1 2016-12-12 05:54:372016-12-12 05:54:370 0
54.67.94.64 20161214103129 20161214103129 221 1 2016-12-14 10:31:292016-12-14 10:31:290 0
54.67.94.64 20161214114347 20161214114347 222 1 2016-12-14 11:43:472016-12-14 11:43:470 0
54.67.94.64 20161223121248 20161223121248 223 1 2016-12-23 12:12:482016-12-23 12:12:480 0
54.67.94.64 20161227085155 20161227085155 224 1 2016-12-27 08:51:552016-12-27 08:51:550 0
54.67.94.64 20161228161033 20161228161033 225 1 2016-12-28 16:10:332016-12-28 16:10:330 0
54.67.94.64 20170103044539 20170103044539 226 1 2017-01-03 04:45:392017-01-03 04:45:390 0
54.67.94.64 20170103202910 20170103202910 227 1 2017-01-03 20:29:102017-01-03 20:29:100 0
54.67.94.64 20170107074824 20170107074824 228 1 2017-01-07 07:48:242017-01-07 07:48:240 0
54.67.94.64 20170110212056 20170110212056 229 1 2017-01-10 21:20:562017-01-10 21:20:560 0
54.67.94.64 20170112085036 20170112085036 230 1 2017-01-12 08:50:362017-01-12 08:50:360 0
54.67.94.64 20170113024445 20170113024445 231 1 2017-01-13 02:44:452017-01-13 02:44:450 0
54.67.94.64 20170119041131 20170119041131 233 1 2017-01-19 04:11:312017-01-19 04:11:310 0
54.67.94.64 20170119103946 20170119103946 234 1 2017-01-19 10:39:462017-01-19 10:39:460 0
54.67.94.64 20170125074200 20170125074200 236 1 2017-01-25 07:42:002017-01-25 07:42:000 0
54.67.94.64 20170126184418 20170126184418 237 1 2017-01-26 18:44:182017-01-26 18:44:180 0
54.67.94.64 20170130145846 20170130145846 238 1 2017-01-30 14:58:462017-01-30 14:58:460 0
54.67.94.64 20170130162443 20170130162443 239 1 2017-01-30 16:24:432017-01-30 16:24:430 0

In [69]:
detach(anon_mwsessions_results)

Misalignment bucketing by event by event length


In [70]:
anon_revision_session_data <- data.table(read.table("~/Desktop/revision_session_data.tsv", header=TRUE, sep="\t"))

In [71]:
anon_revision_alignment <- data.table(read.table("../../results/sql_queries/misalignment_and_edits/anon_revision_alignment.tsv", header=FALSE, sep="\t"))

In [72]:
colnames(anon_revision_alignment) <- c('entity_id','revision_id','revision_user','quality_class', 'views_class')

In [73]:
anon_revision_session_data_and_alignment <- merge(anon_revision_session_data, anon_revision_alignment, by = "revision_id")

In [74]:
anon_revision_session_data_and_alignment$start_time <- as.POSIXct(as.character(anon_revision_session_data_and_alignment$session_start), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [75]:
anon_revision_session_data_and_alignment$end_time <- as.POSIXct(as.character(anon_revision_session_data_and_alignment$session_end), format='%Y%m%d%H%M%S', origin='1970-01-01')

In [76]:
anon_revision_session_data_and_alignment$time_difference <- anon_revision_session_data_and_alignment$end_time - anon_revision_session_data_and_alignment$start_time

less than 10


In [77]:
less_than_10_aligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference < 10,], user, session_start), number_of_aligned_revisions_bucket_1 = n())

In [78]:
less_than_10_misaligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference < 10,], user, session_start), number_of_misaligned_revisions_bucket_1 = n())

In [79]:
less_than_10 <- merge(less_than_10_aligned, less_than_10_misaligned)

In [80]:
less_than_10$proportion_aligned_bucket_1 = less_than_10$number_of_aligned_revisions/(less_than_10$number_of_misaligned_revisions + less_than_10$number_of_aligned_revisions)

between 10 and 99 seconds time difference bucket


In [81]:
from_10_to_99_aligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference >= 10 & time_difference < 99,], user, session_start), number_of_aligned_revisions_bucket_2 = n())

In [82]:
from_10_to_99_misaligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference >= 10 & time_difference < 99,], user, session_start), number_of_misaligned_revisions_bucket_2 = n())

In [83]:
from_10_to_99 <- merge(from_10_to_99_aligned, from_10_to_99_misaligned)

In [84]:
from_10_to_99$proportion_aligned_bucket_2 = from_10_to_99$number_of_aligned_revisions/(from_10_to_99$number_of_misaligned_revisions + from_10_to_99$number_of_aligned_revisions)

between 100 and 999 seconds time difference bucket


In [85]:
from_100_to_999_aligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference >= 100 & time_difference < 999,], user, session_start), number_of_aligned_revisions_bucket_3 = n())

In [86]:
from_100_to_999_misaligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference >= 100 & time_difference < 999,], user, session_start), number_of_misaligned_revisions_bucket_3 = n())

In [87]:
from_100_to_999 <- merge(from_100_to_999_aligned, from_100_to_999_misaligned)

In [88]:
from_100_to_999$proportion_aligned_bucket_3 = from_100_to_999$number_of_aligned_revisions/(from_100_to_999$number_of_misaligned_revisions + from_100_to_999$number_of_aligned_revisions)

between 1000 and 9999 seconds time difference bucket


In [89]:
from_1000_to_9999_aligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference >= 1000 & time_difference < 9999,], user, session_start), number_of_aligned_revisions_bucket_4 = n())

In [90]:
from_1000_to_9999_misaligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference >= 1000 & time_difference < 9999,], user, session_start), number_of_misaligned_revisions_bucket_4 = n())

In [91]:
from_1000_to_9999 <- merge(from_1000_to_9999_aligned, from_1000_to_9999_misaligned)

In [92]:
from_1000_to_9999$proportion_aligned_bucket_4 = from_1000_to_9999$number_of_aligned_revisions/(from_1000_to_9999$number_of_misaligned_revisions + from_1000_to_9999$number_of_aligned_revisions)

greater than or equal 10000 seconds time difference bucket


In [93]:
greater_than_or_equal_10000_aligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference > 10000,], user, session_start), number_of_aligned_revisions_bucket_5 = n())

In [94]:
greater_than_or_equal_10000_misaligned = summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference > 10000,], user, session_start), number_of_misaligned_revisions_bucket_5 = n())

In [95]:
greater_than_or_equal_10000 <- merge(greater_than_or_equal_10000_aligned, greater_than_or_equal_10000_misaligned)

In [96]:
greater_than_or_equal_10000$proportion_aligned_bucket_5 = greater_than_or_equal_10000$number_of_aligned_revisions/(greater_than_or_equal_10000$number_of_misaligned_revisions + greater_than_or_equal_10000$number_of_aligned_revisions)

In [97]:
head(greater_than_or_equal_10000)


usersession_startnumber_of_aligned_revisions_bucket_5number_of_misaligned_revisions_bucket_5proportion_aligned_bucket_5
10.4.1.102 20130820110544 5 398 0.01240695
10.4.1.125 20130916104144381 323 0.54119318
10.4.1.65 20140225000014965 721 0.57236062
10.64.0.127 20130311224842162 413 0.28173913
10.64.0.127 20130527091350 66 108 0.37931034
10.68.16.133 20160311194108690 226 0.75327511

In [98]:
anon_alignment_buckets = data.table(alignment_means = c(mean(less_than_10$proportion_aligned_bucket_1),
                                                     mean(from_10_to_99$proportion_aligned_bucket_2),
                                                     mean(from_100_to_999$proportion_aligned_bucket_3),
                                                     mean(from_1000_to_9999$proportion_aligned_bucket_4),
                                                     mean(greater_than_or_equal_10000$proportion_aligned_bucket_5)))

In [99]:
head(anon_alignment_buckets)


alignment_means
0.5041298
0.4966707
0.4820898
0.4669794
0.4483662

In [ ]:


In [ ]:


In [ ]: